{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Evaluation of oversamplers with a set of classifiers on one database\n",
    "\n",
    "In this notebook, we give an example of optimizing oversamplers and classifiers for a given dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os.path\n",
    "\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "\n",
    "import smote_variants as sv\n",
    "\n",
    "import imbalanced_databases as imbd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# the evaluation procedure uses this directory for caching\n",
    "\n",
    "cache_path = os.path.join(os.path.expanduser('~'), 'smote_test')\n",
    "\n",
    "# exist_ok=True makes the call a no-op when the directory is already there,\n",
    "# so no separate os.path.exists() check (with its check-then-create race) is needed;\n",
    "# it still raises if the path exists but is not a directory\n",
    "os.makedirs(cache_path, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load the glass0 dataset from imbalanced_databases; every later cell works on it\n",
    "\n",
    "dataset = imbd.load_glass0()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# the two classifiers (both with default parameters) used in the evaluation\n",
    "\n",
    "knn_classifier, dt_classifier = KNeighborsClassifier(), DecisionTreeClassifier()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2019-06-11 18:18:38,524:INFO:dataset: glass0, samplings_available: True, evaluations_available: True\n",
      "2019-06-11 18:18:38,524:INFO:doing the folding\n",
      "2019-06-11 18:18:38,525:INFO:Folding reading from file folding_glass0.pickle\n",
      "2019-06-11 18:18:38,529:INFO:do the samplings\n",
      "2019-06-11 18:18:38,529:INFO:create sampling objects\n",
      "2019-06-11 18:18:38,532:INFO:executing 72 sampling in parallel\n",
      "2019-06-11 18:19:15,450:INFO:do the evaluations\n",
      "2019-06-11 18:19:15,451:INFO:create classifier jobs\n",
      "2019-06-11 18:19:15,487:INFO:executing 72 evaluation jobs in parallel\n",
      "2019-06-11 18:19:15,934:INFO:concatenating the results\n",
      "2019-06-11 18:19:16,212:INFO:aggregating the results\n"
     ]
    }
   ],
   "source": [
    "# run the evaluation of the 5 quickest oversamplers with both classifiers,\n",
    "# using 5 parallel jobs and at most 35 random but meaningful parameter combinations;\n",
    "# cache_path is the caching directory set up earlier\n",
    "\n",
    "results = sv.evaluate_oversamplers(\n",
    "    datasets=[dataset],\n",
    "    samplers=sv.get_n_quickest_oversamplers(5),\n",
    "    classifiers=[knn_classifier, dt_classifier],\n",
    "    cache_path=cache_path,\n",
    "    n_jobs=5,\n",
    "    max_samp_par_comb=35,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# locate the oversampler/classifier combination with the highest AUC score;\n",
    "# idxmax() gives the index label of that row in results, not the score itself\n",
    "\n",
    "highest_auc_score = results['auc'].idxmax()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read the classifier and oversampler names, together with their AUC-optimal\n",
    "# parameters, from the results row selected above\n",
    "\n",
    "cl, cl_par, samp, samp_par = results.loc[highest_auc_score][[\n",
    "    'classifier',\n",
    "    'classifier_parameters_auc',\n",
    "    'sampler',\n",
    "    'sampler_parameters_auc',\n",
    "]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# build the oversampler and classifier objects that gave the highest AUC score\n",
    "#\n",
    "# NOTE(review): eval() executes the strings taken from results as Python code,\n",
    "# so only run this on results (and a cache directory) you produced yourself.\n",
    "# eval(cl) presumably resolves the classifier class by its name, which therefore\n",
    "# has to be imported into the notebook namespace - confirm when adding classifiers.\n",
    "\n",
    "samp_obj = getattr(sv, samp)(**eval(samp_par))\n",
    "cl_obj = eval(cl)(**eval(cl_par))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2019-06-11 18:19:16,995:INFO:SMOTE_D: Running sampling via ('SMOTE_D', \"{'proportion': 1.5, 'k': 5, 'n_jobs': 1}\")\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
       "           metric_params=None, n_jobs=None, n_neighbors=5, p=2,\n",
       "           weights='uniform')"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# oversample the entire dataset with the selected oversampler, then fit the\n",
    "# selected classifier on the oversampled data (fit() is the cell's displayed value)\n",
    "\n",
    "X_samp, y_samp = samp_obj.sample(dataset['data'], dataset['target'])\n",
    "cl_obj.fit(X_samp, y_samp)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
